Final project R for Bio: Group 19

Ida Sofie Adrian (s243903), Mariam Dalia (s242707), Mathilde Melgaard (s243633), Nina Zomorrodnia (s243923), Victor Hansen (s243634)

Introduction & Aim

Introduction

  • Cervical cancer is a significant cause of mortality in low-income countries.

  • The dataset:

    • Medical records from 858 female patients.
    • Random sampling of patients between the years 2012 and 2013.
    • Gynecology service at Hospital Universitario de Caracas in Caracas, Venezuela.

Aim

  1. Find correlations between variables and cancer diagnosis
  2. Find correlations between different interesting variables
  • Chi-squared test:

    • To evaluate whether there is a significant association between two variables.
  • PCA:

    • To identify patterns and relationships in data.
  • Visualization:

    • To present the data using various plots.

Methods: Data cleaning

  • Cleaning NAs

  • Changing ? to NAs

  • Change values 1.0 and 0.0 to yes and no

  • Changing columns to numeric

  • Renaming columns for consistency

# Changing values 
data_raw <- data_raw |>
  mutate(
    Smokes = case_when(
      Smokes == "0.0" ~ "no",
      Smokes == "1.0" ~ "yes"),
    Hormonal.Contraceptives = case_when(
      Hormonal.Contraceptives == "1.0" ~ "yes",
      Hormonal.Contraceptives == "0.0" ~ "no"),
    IUD = case_when(
      IUD == "0.0" ~ "no",
      IUD == "1.0" ~ "yes"),
    Dx.Cancer = case_when(
      Dx.Cancer == "0" ~ "no",
      Dx.Cancer == "1" ~ "yes"))

# Tidying
data_clean <- data_raw |> 
  rename('Smokes.years'= Smokes..years.,
         'Smokes.packs.years' = Smokes..packs.year.) |> 
  rename_with(~ gsub("^STDs\\.\\.", "", .)) |>
  rename_with(~ str_remove(.,"\\.$")) |> #removes the '.' from the last word in columns
  rename_with(~ str_replace_all(., "\\.", "_")) |> 
  rename_with(~ str_replace_all(.x, "__", "_"))

Methods: Data augment

  • Creating IDs

  • Creating patient IDs

  • Pivot_long to create new STD column

  • Tidying names in rows

  • Creating new count column for STD

  • Changing the order of columns

# Changing data to long
data_long <- data_clean |>
  pivot_longer(cols = starts_with("STDs_"), 
               names_to = "STD_type", 
               values_to = "has_STD") |>
  mutate(
    STD = ifelse(has_STD == 1, STD_type, NA)  # Keep STD name where 1 is present) |>
  group_by(ID) |>
  mutate(
    # Concatenate STD names for each ID, if none, set "No"
    STD = ifelse(all(is.na(STD)), "No", paste(na.omit(STD), collapse = ", "))) |>
  ungroup() |>
  select(-STD_type, -has_STD) |>
  distinct() |> 
  separate_rows(STD, sep = ",")

# Creating new column
data_long <- data_long |> 
  group_by(ID) |> 
  mutate(Number_of_STDs = if_else(all(is.na(STD) | STD == "No"), 0, n_distinct(STD, na.rm = TRUE))) |> ungroup())

Results: Barplot

  • Few women have STDs, regardless of cervical cancer diagnosis status

  • Women with cancer: HPV seems to be the only STD present.

  • Women without cancer: A few patients with different types of STDs, but no cases with HPV specifically.

data_normalized_STD <- data_aug |>
  group_by(Dx_Cancer, STD) |>
  summarize(count = n(), .groups = "drop") |>
  group_by(Dx_Cancer) |>
  mutate(prop = count / sum(count))

data_normalized_STD <- data_normalized_STD |> mutate(STD = fct_reorder(STD, count, .desc = TRUE))

barplot_STD <- ggplot(data = data_normalized_STD, aes(x = STD, y = prop, fill = Dx_Cancer)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_grid(~Dx_Cancer) +
  labs(title = "Proportion of STD type by Cancer Diagnosis",
       x = "STD", y = "Proportion",
       fill = "Cancer Diagnosis",
       caption = "Source: https://doi.org/10.24432/C5Z310") +
  theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate text
        plot.margin = margin(10, 10, 10, 50)  # Increase the left margin)

barplot_STD

Results: Boxplot

Hypothesis: Women with first sexual intercourse at a young age tend to get STDs and later cervical cancer.

  • No correlation between number of sexual partners and cervical cancer diagnosis status.

  • Small correlation between the age of first sexual intercourse and cervical cancer diagnosis status:

Women with cervical cancer were older when they had first sexual intercourse compared to women without cancer.

  • Small correlation between the age of first sexual intercourse and cervical cancer diagnosis: Women who are diagnosed with cervical cancer were older when they had first sexual intercourse compared to women without cancer.

Hypothesis does not seem to be true.

Numb_sex <- ggplot(data = data_aug, aes(y = Dx_Cancer, x = Number_of_sexual_partners, fill = Dx_Cancer)) +
  geom_boxplot(show.legend = FALSE) +
   labs(y = "Cancer Diagnosis", x = "Number of Sexual Partners") +
  theme_bw()

first_sex_int <- ggplot(data = data_aug, aes(y = Dx_Cancer, x = First_sexual_intercourse, fill = Dx_Cancer)) +
  geom_boxplot(show.legend = FALSE) +
   labs(y = "Cancer Diagnosis", x = "Age of First Sexual Intercourse") +
  theme_bw()

boxplot_sex_his <- (Numb_sex / first_sex_int) +
  plot_annotation(
    title = "Comparison of Sexual History and Cancer Diagnosis",
    caption = "Source: https://doi.org/10.24432/C5Z310")

boxplot_sex_his

Results: Function for Correlation Visualization of Categorical Variables

create_proportional_barplot <- function(data, x_var, fill_var, 
                                        x_label = NULL, y_label = "Proportion", 
                                        fill_label = NULL, title = NULL) {
  if (is.null(x_label)) x_label <- x_var
  if (is.null(fill_label)) fill_label <- fill_var
  if (is.null(title)) title <- paste("Proportion of", fill_var, "by", x_var)

  filtered_data <- data %>%
    group_by(ID) %>%
    summarise(
      x_value = first(!!sym(x_var)),
      fill_value = first(!!sym(fill_var)),
      .groups = "drop"
    ) %>%
    filter(!is.na(x_value), !is.na(fill_value)) %>%
    mutate(fill_value = factor(fill_value))

  # Ensure color palette matches the levels of 'fill_value'
  color_palette <- setNames(
    c("lightblue", "darkred", "green", "orange")[1:length(levels(filtered_data$fill_value))],
    levels(filtered_data$fill_value))

  ggplot(filtered_data, aes(x = x_value, fill = fill_value)) +
    geom_bar(position = "fill") +
    labs(x = x_label, y = y_label, fill = fill_label, title = title) +
    scale_fill_manual(values = color_palette) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 0, hjust = 0.5))}

proportion_plot <- create_proportional_barplot(data_aug, x_var = "Dx_Cancer",
                                               fill_var = "Smokes")
ggsave("../results/images/05_proportion_plot.png", plot = proportion_plot)
  • Input: dataset and two variables, Output: stratified bar plot
  • Compare categorical variables such as: IUD, Hormonal Contraceptives, Smoking, Cancer diagnosis

Results: Function for contingency table and correlation score

Input: Data set and two variables

Output: Contingency table and chi-2

calculate_chi_squared <- function(data, var1, var2) {
  # Summarize the data so each ID has one row, taking the first occurrence of var1 and var2
  filtered_data <- data %>%
    group_by(ID) %>%
    summarise(
      var1_value = first(!!sym(var1)), var2_value = first(!!sym(var2)),
      .groups = "drop") %>%
    filter(!is.na(var1_value), !is.na(var2_value))  # Remove rows with NA values
  
  # Create the contingency table
  contingency_table <- filtered_data %>%
    count(var1_value, var2_value) %>%
    pivot_wider(names_from = var2_value, values_from = n, values_fill = 0) %>%
    column_to_rownames("var1_value") %>% as.matrix()

  # Perform the chi-squared test
  chisq_result <- chisq.test(contingency_table)
  
  return(list(contingency_table = contingency_table, chisq_result = chisq_result))}

calculate_chi_squared(data_aug, var1 = "Dx_Cancer", var2 = "Dx_HPV")

:::

Results: PCA

  • Numeric values
  • Scaling data
  • Two distinct clusters

cancer_data <- data_aug |> 
  select(-Time_since_first_diagnosis, 
         -Time_since_last_diagnosis, Dx_CIN) |> drop_na() 

pca_fit_cancer <- cancer_data |> 
  select(where(is.numeric)) |>  
  prcomp(scale = TRUE)

scatterplot_pca <- pca_fit_cancer |> 
  augment(cancer_data) |> 
  ggplot(aes(.fittedPC1, .fittedPC2, color = factor(Cancer))) + 
  geom_point(size = 1.5) +
  theme_half_open(12) + 
  background_grid() + 
  scale_color_discrete(labels = c("0" = "Negative", "1" = "Positive")) +
  labs(color = "Cervical Cancer Present") + 
  ggtitle("PCA Plot") + theme(plot.title = element_text(hjust = 0.5))

Results: PCA

  • Extract the rotation matrix

  • Arrows in similar directions = positive correlation

  • Arrows in opposite directions = negative correlation

  • Arrows near the origin = minimal contribution to the components

  • The length of an arrow = amount of contribution to the components

  • Extract Eigenvalues of the PCA

  • Represents the amount of variance explained by each principal component

  • PC4 and PC5 explains 50-60% of the variance

Conclusions

  • Correlation between:
    • HPV and cancer diagnosis
    • First sexual intercourse and cancer diagnosis
    • Age and cancer diagnosis